In [ ]:
This notebook pulls in data on Nipah RBP entry into CHO-EFNB2 and CHO-EFNB3 cells, filters data, calculates stats, and makes figures¶
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
e2_distances_file = None
func_scores_E2_file = None
func_scores_E3_file = None
filtered_E2_data = None
filtered_E3_data = None
contact_type_plot = None
E2_E3_entry_corr_plot = None
E2_E3_entry_all_muts_plot = None
combined_E2_E3_correlation_plots = None
entry_region_boxplot_plot = None
nipah_config = None
altair_config = None
entropy_file = None
surface = None
In [2]:
# Parameters
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
e2_distances_file = "results/distances/2vsm_distances.csv"
contact_type_plot = "results/images/contact_type_plot.html"
filtered_E2_data = "results/filtered_data/E2_entry_filtered.csv"
filtered_E3_data = "results/filtered_data/E3_entry_filtered.csv"
E2_E3_entry_corr_plot = "results/images/E2_E3_entry_corr_plot.html"
E2_E3_entry_all_muts_plot = "results/images/E2_E3_entry_all_muts_plot.html"
combined_E2_E3_correlation_plots = (
"results/images/combined_E2_E3_correlation_plots.html"
)
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
entry_region_boxplot_plot = "results/images/entry_region_boxplot_plot.html"
surface = "data/custom_analyses_data/surface_exposure.csv"
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import Bio.SeqIO
import yaml
from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
if os.getcwd() == '/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/':
pass
print("Already in correct directory")
else:
os.chdir("/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/")
print("Setup in correct directory")
Setup in correct directory
Paths for running notebook interactively¶
In [5]:
if surface is None:
e2_distances_file = "results/distances/2vsm_distances.csv"
func_scores_E2_file = "results/func_effects/averages/CHO_EFNB2_low_func_effects.csv"
func_scores_E3_file = "results/func_effects/averages/CHO_EFNB3_low_func_effects.csv"
filtered_E2_data = "results/filtered_data/E2_entry_filtered.csv"
filtered_E3_data = "results/filtered_data/E3_entry_filtered.csv"
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
entropy_file = "results/entropy/entropy.csv"
surface = "data/custom_analyses_data/surface_exposure.csv"
Read in custom altair theme and import YAML file with parameters¶
In [6]:
if altair_config:
with open(altair_config, 'r') as file:
exec(file.read())
with open(nipah_config) as f:
config = yaml.safe_load(f)
Filter and merge EFNB2 and EFNB3 entry¶
In [7]:
#Import median entry scores from different selections
func_scores_E2 = pd.read_csv(func_scores_E2_file).dropna().round(3) # this removes wildtype observations from df
func_scores_E3 = pd.read_csv(func_scores_E3_file).dropna().round(3) # this removes wildtype observations from df
In [8]:
def num_selections_and_filter(df, name):
def calculate_filtering_mutants(df, name):
print(f"\nThe dataset is: {name}")
# How many selections were performed
max_sels = df["n_selections"].max()
num_sels_cutoff = (max_sels / 2) + 1
# Find total number of possible mutants
total_mut = (532) * 19
# Filter data
filter_test = df[
(df["site"] != 603) & (df["mutant"] != "-") & (df["mutant"] != "*")
]
num_variants_pre_filter = filter_test.shape[0]
print(
f"After filtering stop and gaps, there are {num_variants_pre_filter} mutants which is {(num_variants_pre_filter/total_mut) * 100:.1f}%"
)
filter_test_times_seen = filter_test[
filter_test["times_seen"] >= config["func_times_seen_cutoff"]
]
num_variants_times_seen = filter_test_times_seen.shape[0]
print(
f"After filtering for {config['func_times_seen_cutoff']} times seen, there are {num_variants_times_seen}, which is {(num_variants_times_seen/total_mut) * 100:.1f}%"
)
filter_test_effect_std = filter_test_times_seen[
filter_test_times_seen["effect_std"] <= config["func_std_cutoff"]
]
num_variants_std = filter_test_effect_std.shape[0]
print(
f"After filtering for {config['func_std_cutoff']} std cutoff, there are {num_variants_std}, which is {(num_variants_std/total_mut) * 100 :.1f}%"
)
filter_test_n_selections = filter_test_effect_std[
filter_test_effect_std["n_selections"] >= num_sels_cutoff
]
num_variants_n_selections = filter_test_n_selections.shape[0]
print(
f"After filtering for mutants in in all selections, there are {num_variants_n_selections}, which is {(num_variants_n_selections/total_mut) * 100 :.1f}%"
)
def apply_filters(df, name):
# Now do the main filtering
max_sels = df["n_selections"].max()
num_sels_cutoff = (max_sels / 2) + 1
print(
f"The number of selections a mutant must be observed in is: {num_sels_cutoff}"
)
# The main filtering. Filters site 603 (is a stop codon/end of gene and we don't want those mutants). Also filter out stop mutants and apply filtering from config file
filtered_df = df[
(df["site"] != 603)
& (df["mutant"] != "-")
& (df["mutant"] != "*")
& (df["times_seen"] >= config["func_times_seen_cutoff"])
& (df["effect_std"] <= config["func_std_cutoff"])
& (df["n_selections"] >= num_sels_cutoff)
]
return filtered_df
# Filtering stats
calculate_filtering_mutants(df, name) # call definition above
# Filter
filtered_df = apply_filters(df, name)
# Now write filtered_data to results/
if name == "E2":
filtered_df.to_csv(filtered_E2_data, index=False)
if name == "E3":
filtered_df.to_csv(filtered_E3_data, index=False)
# return filtered dataframe
return filtered_df
# Call the filtering functions
func_scores_E2 = num_selections_and_filter(func_scores_E2, "E2")
func_scores_E3 = num_selections_and_filter(func_scores_E3, "E3")
# make a merged dataframe of ephrin-b2 and ephrin-b3 entry data
def merge_data(df1,df2):
merged_df = pd.merge(
df1,
df2,
on=["site", "mutant", "wildtype"],
how="outer",
suffixes=["_E2", "_E3"],
)
df1["cell_type"] = "CHO-EFNB2"
df2["cell_type"] = "CHO-EFNB3"
concat_df = pd.concat([df1,df2])
return merged_df,concat_df
merged_df,concat_df = merge_data(func_scores_E2,func_scores_E3)
# Show some stats of filtered merged data
stats = merged_df.describe().round(2)
display(stats)
The dataset is: E2 After filtering stop and gaps, there are 10048 mutants which is 99.4% After filtering for 2 times seen, there are 9831, which is 97.3% After filtering for 1 std cutoff, there are 9787, which is 96.8% After filtering for mutants in in all selections, there are 9784, which is 96.8% The number of selections a mutant must be observed in is: 5.0 The dataset is: E3 After filtering stop and gaps, there are 10074 mutants which is 99.7% After filtering for 2 times seen, there are 9891, which is 97.9% After filtering for 1 std cutoff, there are 9776, which is 96.7% After filtering for mutants in in all selections, there are 9679, which is 95.8% The number of selections a mutant must be observed in is: 4.5
| site | effect_E2 | effect_std_E2 | times_seen_E2 | n_selections_E2 | effect_E3 | effect_std_E3 | times_seen_E3 | n_selections_E3 | |
|---|---|---|---|---|---|---|---|---|---|
| count | 9919.00 | 9784.00 | 9784.00 | 9784.00 | 9784.00 | 9679.00 | 9679.00 | 9679.00 | 9679.00 |
| mean | 337.18 | -0.92 | 0.32 | 7.50 | 7.99 | -0.95 | 0.34 | 6.55 | 6.99 |
| std | 153.14 | 1.33 | 0.24 | 4.26 | 0.11 | 1.38 | 0.25 | 3.52 | 0.13 |
| min | 71.00 | -3.49 | 0.00 | 2.00 | 5.00 | -3.59 | 0.00 | 2.00 | 5.00 |
| 25% | 205.00 | -2.15 | 0.15 | 4.88 | 8.00 | -2.34 | 0.16 | 4.57 | 7.00 |
| 50% | 338.00 | -0.30 | 0.30 | 6.62 | 8.00 | -0.26 | 0.32 | 5.86 | 7.00 |
| 75% | 470.00 | 0.15 | 0.47 | 8.88 | 8.00 | 0.15 | 0.49 | 7.57 | 7.00 |
| max | 602.00 | 0.60 | 1.00 | 72.38 | 8.00 | 0.62 | 1.00 | 56.71 | 7.00 |
Stats¶
In [9]:
def calculate_stats(df, name):
print(f"For {name}:")
total_mut = (532) * 19
print(f'There are {total_mut} amino acid mutations possible')
muts_present = df["effect"].shape[0]
fraction_muts = muts_present / total_mut
print(
f"fraction muts present in {name} is {fraction_muts:.2f} {muts_present}/{total_mut}"
)
# Break mutations into bins (negative, neutral, positive) and calculate how many mutants there are
deleterious_muts = df[df["effect"] <= -0.25].shape[0]
neutral_muts = df[(df["effect"] > -0.25) & (df["effect"] < 0.25)].shape[0]
positive_muts = df[df["effect"] > 0.25].shape[0]
frac_bad_muts = deleterious_muts / muts_present
frac_neutral_muts = neutral_muts / muts_present
frac_pos_muts = positive_muts / muts_present
print(
f"The number of deleterious mutants for {name} is {frac_bad_muts:.2f} {deleterious_muts}/{muts_present}"
)
print(
f"The number of neutral mutants for {name} is {frac_neutral_muts:.2f} {neutral_muts}/{muts_present}"
)
print(
f"The number of positive mutants for {name} is {frac_pos_muts:.2f} {positive_muts}/{muts_present}\n"
)
calculate_stats(func_scores_E2, "CHO-EFNB2")
calculate_stats(func_scores_E3, "CHO-EFNB3")
For CHO-EFNB2: There are 10108 amino acid mutations possible fraction muts present in CHO-EFNB2 is 0.97 9784/10108 The number of deleterious mutants for CHO-EFNB2 is 0.52 5086/9784 The number of neutral mutants for CHO-EFNB2 is 0.30 2934/9784 The number of positive mutants for CHO-EFNB2 is 0.18 1750/9784 For CHO-EFNB3: There are 10108 amino acid mutations possible fraction muts present in CHO-EFNB3 is 0.96 9679/10108 The number of deleterious mutants for CHO-EFNB3 is 0.50 4878/9679 The number of neutral mutants for CHO-EFNB3 is 0.34 3251/9679 The number of positive mutants for CHO-EFNB3 is 0.16 1539/9679
How many sites and which sites only have negative entry scores for mutations?¶
In [10]:
def overall_stats_all_neg(df,effect,region=None):
if region is None:
contact_df = df
else:
contact_df = df[df['site'].isin(region)]
filtered_df = contact_df.groupby('site').filter(lambda group: (group[effect] < 0).all())
unique = filtered_df['site'].unique()
print(list(unique))
total_sites = contact_df['site'].unique().shape[0]
subset = filtered_df['site'].unique().shape[0]
fraction = subset/total_sites
percent = fraction * 100
print(f'The total number of sites are: {total_sites}\n')
print(f' The number of sites where all mutants are negative for {effect}: {subset}\n')
print(f' The percent of sites where all mutants are negative for {effect}: {percent:.0f}\n')
return unique
intolerant_sites_E2 = list(overall_stats_all_neg(func_scores_E2,'effect',region=config['dimerization']))
intolerant_sites_E3 = list(overall_stats_all_neg(func_scores_E3,'effect',region=config['dimerization']))
[160, 162, 163, 165, 167, 172, 203, 205, 206, 208, 240, 258, 259, 260, 266, 267, 323, 331] The total number of sites are: 43 The number of sites where all mutants are negative for effect: 18 The percent of sites where all mutants are negative for effect: 42 [162, 163, 206, 208, 240, 258, 259, 260, 266, 584] The total number of sites are: 43 The number of sites where all mutants are negative for effect: 10 The percent of sites where all mutants are negative for effect: 23
In [11]:
def calculate_top_func_scores(df,effect):
percentile_95_effect_E2 = df[effect].quantile(0.999)
cutoff_E2_df_sites = df[df[effect] > percentile_95_effect_E2]
E2_site_cutoff = cutoff_E2_df_sites['site'].unique()
print(f'The sites with the highest functional scores for {effect} are: {list(E2_site_cutoff)}')
calculate_top_func_scores(func_scores_E2,'effect')
calculate_top_func_scores(func_scores_E3,'effect')
The sites with the highest functional scores for effect are: [280, 351, 407, 433, 477, 501, 529, 584] The sites with the highest functional scores for effect are: [90, 183, 315, 335, 358, 378, 418, 420, 544, 554]
Make bubble plots of receptor contact site type (median values per site)¶
In [12]:
def make_bubbleplot_entry_region(df): # Create a bubble plot using Altair for contact site mutants
barrel_ranges = {
"Hydrophobic": config["hydrophobic"],
"Salt Bridges": config["salt_bridges"],
"Hydrogen Bonds": config["h_bond_total"],
"Contact": config["contact_sites"],
"Overall": list(range(71, 602)),
}
custom_order = [
"Hydrophobic",
"Salt Bridges",
"Hydrogen Bonds",
"Contact",
"Overall",
]
empty_chart = []
variant_selector = alt.selection_point(
on="mouseover", empty=False, fields=["site"], value=1
)
for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
agg_means = []
tmp_df = df[df["cell_type"] == selection]
mean_df = tmp_df.groupby("site")[["effect"]].median().reset_index()
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"barrel": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_point()
.encode(
x=alt.X(
"barrel:O",
sort=custom_order,
title='Contact Type',
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title="Median Cell Entry",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["barrel", "effect", "site"],
size=alt.condition(
variant_selector, alt.value(100), alt.value(25)
),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.3)),
).properties(width=config['width'],height=config['height'])
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
)
empty_chart.append(chart)
combined_effect_chart = (
alt.hconcat(*empty_chart)
.resolve_scale(y="shared", x="shared", color="independent")
.add_params(variant_selector)
)
return combined_effect_chart
tmp_img = make_bubbleplot_entry_region(concat_df)
tmp_img.display()
if entry_region_boxplot_plot is not None:
tmp_img.save(contact_type_plot)
Make bubble plots depending on amino acid property¶
In [13]:
def make_bubbleplot_wildtype_prop(df):
barrel_ranges = {
"hydrophobic": list(["A", "V", "L", "I", "M"]),
"aromatic": list(["Y", "W", "F"]),
"positive": list(["K", "R", "H"]),
"negative": list(["E", "D"]),
"hydrophilic": list(["S", "T", "N", "Q"]),
"special": list(["C", "P", "G"]),
}
empty_charts = []
variant_selector = alt.selection_point(
on="mouseover", empty=False, nearest=True, fields=["site"], value=1
)
for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
if selection == "CHO-EFNB2":
effect_name = "EFNB2"
else:
effect_name = "EFNB3"
tmp_df = df[df["cell_type"] == selection]
unique_wildtype_df = tmp_df[["site", "wildtype"]].drop_duplicates()
mean_df = tmp_df.groupby("site")[["effect"]].median().reset_index()
mean_df = pd.merge(mean_df, unique_wildtype_df, on="site", how="left")
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = mean_df[mean_df["wildtype"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"wildtype_class": barrel, "effect": row["effect"], "site": row["site"], "wildtype": row["wildtype"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_point()
.encode(
x=alt.X(
"wildtype_class:O",
title="Wildtype amino acid property",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title=f"Median Cell Entry",
axis=alt.Axis(grid=True, tickCount=4),
),
xOffset="random:Q",
tooltip=["wildtype_class", "effect", "site","wildtype"],
size=alt.condition(variant_selector, alt.value(100), alt.value(25)),
color=alt.condition(
variant_selector, alt.value("orange"), alt.value("black")
),
opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.2)),
).properties(width=config['width'],height=config['height'])
.transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
)
empty_charts.append(chart)
combined_effect_chart = (
alt.hconcat(*empty_charts)
.resolve_scale(y="shared", x="shared", color="independent")
.add_params(variant_selector)
)
return combined_effect_chart
wildtype_aa_bubble_img = make_bubbleplot_wildtype_prop(concat_df)
wildtype_aa_bubble_img.display()
Plot correlations between E2 and E3 entry¶
In [14]:
# Import distance data
e2_distances = pd.read_csv(e2_distances_file)
distance_df = pd.merge(
merged_df, e2_distances[["site", "distance"]], on="site", how="left"
)
def determine_distance(df):
# Define the conditions
conditions = [
df["distance"] < 4,
(df["distance"] >= 4) & (df["distance"] <= 8),
df["distance"] > 8,
]
# Define the associated values for the conditions
choices = ["contact", "close", "distant"]
# Apply the conditions and choices to the 'E2_contact' column
df["contact"] = np.select(conditions, choices, default="distant")
return df
distance_df = determine_distance(distance_df)
def median_correlation_plot(df, metric):
aggregation = getattr(df.groupby("site")[["effect_E2", "effect_E3"]], metric)
means = aggregation().reset_index()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
means["effect_E2"], means["effect_E3"]
)
display(r_value.round(2))
means = means.rename(
columns={"effect_E2": f"{metric}_E2", "effect_E3": f"{metric}_E3"}
)
contact_sites = df[["site", "contact", "wildtype"]].drop_duplicates()
df_mean = pd.merge(means, contact_sites, on="site", how="left")
chart = (
alt.Chart(df_mean)
.mark_point()
.encode(
x=alt.X(f"{metric}_E2", title="Summed Cell Entry for EFNB2"),
y=alt.Y(f"{metric}_E3", title="Summed Cell Entry for EFNB3"),
tooltip=["site", "wildtype"],
color=alt.Color(
"contact",
scale=alt.Scale(
domain=["contact", "close", "distant"],
range=["#1f4e79", "#ff7f0e", "gray"],
),
legend=alt.Legend(title="RBP Distance to Receptor"),
),
)
)
min_ = int(df_mean[f"{metric}_E2"].min())
max_ = int(df_mean[f"{metric}_E3"].max())
text = (
alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
.mark_text(
align="left",
baseline="top",
dx=0, # Adjust this for position
dy=0, # Adjust this for position
)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
plot = chart + text
return plot
E2_E3_plot = median_correlation_plot(distance_df, "sum")
E2_E3_plot.display()
if entry_region_boxplot_plot is not None:
E2_E3_plot.save(E2_E3_entry_corr_plot)
def correlation_plot(df):
df = df.dropna()
slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
df["effect_E2"], df["effect_E3"]
)
display(r_value.round(2))
variant_selector = alt.selection_point(
on="mouseover", empty=False, nearest=True, fields=["contact"], value=1
)
chart = (
alt.Chart(df)
.mark_point(opacity=0.2,size=15)
.encode(
x=alt.X("effect_E2", title="Cell Entry for EFNB2"),
y=alt.Y("effect_E3", title="Cell Entry for EFNB3"),
tooltip=["site", "wildtype", "mutant"],
color=alt.Color(
"contact",
scale=alt.Scale(
domain=["contact", "close", "distant"],
range=["#1f4e79", "#ff7f0e", "gray"],
),
legend=alt.Legend(title="RBP Distance to Receptor"),
),
opacity=alt.condition(variant_selector,alt.value(1),alt.value(0.2)),
).add_params(variant_selector)
)
min_ = int(df['effect_E2'].min())
max_ = int(df['effect_E3'].max())
text = (
alt.Chart({"values": [{"x": min_, "y": max_, "text": f"r = {r_value:.2f}"}]})
.mark_text(
align="left",
baseline="top",
dx=-20, # Adjust this for position
dy=-20, # Adjust this for position
)
.encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
)
plot = chart + text
return plot
tmp_img_corr = correlation_plot(distance_df)
tmp_img_corr.display()
if entry_region_boxplot_plot is not None:
tmp_img_corr.save(E2_E3_entry_all_muts_plot)
if entry_region_boxplot_plot is not None:
(E2_E3_plot | tmp_img_corr).save(combined_E2_E3_correlation_plots)
0.81
0.79
Make boxplot showing median entry by RBP region¶
In [15]:
def make_boxplot_entry_region(df):
barrel_ranges = {
"Stalk": list(range(70, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
#'Receptor Contact': config['contact_sites'],
#"Overall": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
if selection == "CHO-EFNB2":
effect_name = "EFNB2"
else:
effect_name = "EFNB3"
tmp_df = df[df["cell_type"] == selection]
agg_means = []
#tmp_df = tmp_df.groupby('site')['effect'].median().reset_index()
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color='darkgray')
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title=f"Cell Entry",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "effect", "site"],
).properties(width=config['width'],height=config['height'])
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
if entry_region_boxplot_plot is not None:
entry_region_boxplot.save(entry_region_boxplot_plot)
In [16]:
def make_boxplot_entry_site_class(df):
barrel_ranges = {
#"Stalk": list(range(70, 147)),
#"dimer_A": config['dimer_A'],
#"dimer_B": config['dimer_B'],
"Dimerization": config['dimerization'],
#"Dimerization_contact": config['dimer_contact'],
'Receptor Contact': config['contact_sites'],
#'Contact_contact': config['contact_contact'],
"Head": list(range(178, 602)),
}
custom_order = ["Dimerization", "Receptor Contact", "Head"]
empty_charts = []
for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
if selection == "CHO-EFNB2":
effect_name = "EFNB2"
else:
effect_name = "EFNB3"
tmp_df = df[df["cell_type"] == selection]
agg_means = []
#tmp_df = tmp_df.groupby('site')['effect'].median().reset_index()
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
#agg_means_df = agg_means_df.groupby('site')['effect'].median().reset_index()
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color='darkgray')
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title=f"Cell Entry",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "effect", "site"],
).properties(width=config['width'],height=config['height'])
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
head_region_boxplot = make_boxplot_entry_site_class(concat_df)
head_region_boxplot.display()
Check for potential neutral/beneficial glycosylation sites¶
In [17]:
def find_potential_glycan_sites(df):
filtered = df[df["wildtype"].isin(["T", "S"])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[
(df["site"] == row["site"] - 2) & (df["mutant"] == "N") & (df["effect"] > 0)
]
if not prior_rows.empty:
matching_sites.append(row["site"])
unique_list1 = list(set(matching_sites))
unique_list1 = [x - 2 for x in unique_list1]
filtered = df[df["wildtype"].isin(["N"])]
matching_sites = []
for index, row in filtered.iterrows():
# Check for existence of site two numbers prior with 'N' mutant and positive effect
prior_rows = df[
(df["site"] == row["site"] + 2)
& (df["mutant"].isin(["T", "S"]))
& (df["effect"] > 0)
]
if not prior_rows.empty:
matching_sites.append(row["site"])
unique_list2 = list(set(matching_sites))
unique_list = unique_list1 + unique_list2
unique_list.sort()
print(f"The total number of potential PNLG sites are: {len(unique_list)}")
return unique_list
PNLG = find_potential_glycan_sites(func_scores_E3)
surface_df = pd.read_csv(surface)
surface_df.rename(columns={"exposure_A": "Surface Exposure"}, inplace=True)
PNLG_SURFACE = surface_df[surface_df["site"].isin(PNLG)]
PNLG_SURFACE = list(
PNLG_SURFACE[PNLG_SURFACE["Surface Exposure"] >= 25]["site"].unique()
)
print(f"\nThe surface exposed PNLG sites are: {PNLG_SURFACE}\n")
glycans = config["glycans"]
filtered_PNLG_SURFACE = [num for num in PNLG_SURFACE if num not in glycans]
print(filtered_PNLG_SURFACE)
print(len(filtered_PNLG_SURFACE))
The total number of potential PNLG sites are: 29 The surface exposed PNLG sites are: [159, 180, 191, 192, 306, 311, 326, 359, 378, 383, 403, 417, 423, 473, 478, 543, 554, 570, 600] [180, 191, 192, 311, 326, 359, 383, 403, 423, 473, 478, 543, 554, 570, 600] 15
In [18]:
def entry_by_site(df):
tmp_df = df.groupby('site')['effect'].mean().reset_index()
barrel_ranges = {
"Stalk": list(range(70, 148)),
"Neck": list(range(148, 166)),
"Linker": list(range(166, 178)),
"Head": list(range(178, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
agg_means_df['beta_sheet'] = agg_means_df['site'].isin(config['beta_sheet'])
#display(agg_means_df)
chart = (
alt.Chart(agg_means_df)
.mark_bar(opacity=1)
.encode(
x=alt.X("site:N", title='Site',axis=alt.Axis(labelAngle=-90,values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],tickCount=11,grid=True)),
y=alt.Y("effect", title="Summed Cell Entry for EFNB3"),
tooltip=["site", "effect"],
color=alt.Color('region',sort=custom_order,title='Region'),
)
).properties(width=1500,height=200)
rect = alt.Chart(agg_means_df.query('beta_sheet == True')).mark_rect(color='black').encode(
alt.X('site:N',axis=None),
alt.Y('beta_sheet',axis=None)
).properties(width=1500,height=10)
blah = alt.vconcat(rect,chart,padding=1).resolve_scale(y='independent',x='shared')
blah.display()
return chart
entry_by_site_plot = entry_by_site(func_scores_E3)
#entry_by_site_plot.display()
In [19]:
def entry_by_site_receptor(df,config_name):
tmp_df = df.groupby('site')['effect'].sum().reset_index()
barrel_ranges = {
'Receptor Contact': config_name,
}
#custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df)
.mark_bar(opacity=1)
.encode(
x=alt.X("site:N", axis=alt.Axis(labelAngle=-90)),
y=alt.Y("effect", title="Cell Entry for EFNB3"),
tooltip=["site", "effect"],
#color=alt.Color('region',sort=custom_order),
)
).properties(width=500,height=200)
return chart
entry_by_site_receptor_plot_A = entry_by_site_receptor(func_scores_E3,config['dimer_A'])
entry_by_site_receptor_plot_A.display()
entry_by_site_receptor_plot_B = entry_by_site_receptor(func_scores_E3,config['dimer_B'])
entry_by_site_receptor_plot_B.display()
In [20]:
def entry_by_site_receptor(df):
tmp_df = df.groupby('site')['effect'].median().reset_index()
barrel_ranges = {
'Receptor Contact': config['dimerization'],
}
#custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df)
.mark_bar(opacity=1)
.encode(
x=alt.X("site:N", axis=alt.Axis(labelAngle=-90)),
y=alt.Y("effect", title="Cell Entry for EFNB3"),
tooltip=["site", "effect"],
#color=alt.Color('region',sort=custom_order),
)
).properties(width=500,height=200)
return chart
entry_by_site_receptor_plot = entry_by_site_receptor(func_scores_E3)
entry_by_site_receptor_plot.display()
In [21]:
import pandas as pd
import altair as alt
def entry_by_site_receptor(df):
# Amino acid property mapping
aa_property_mapping = {
"A": "hydrophobic", "V": "hydrophobic", "L": "hydrophobic", "I": "hydrophobic", "M": "hydrophobic",
"Y": "aromatic", "W": "aromatic", "F": "aromatic",
"K": "positive", "R": "positive", "H": "positive",
"E": "negative", "D": "negative",
"S": "hydrophilic", "T": "hydrophilic", "N": "hydrophilic", "Q": "hydrophilic",
"G": "special", "C": "special", "P": "special",
}
# Map amino acid to property and create 'residue_type' column
unique_wildtypes_df = df.drop_duplicates(subset=["site", "wildtype"])
unique_wildtypes_df['residue_type'] = unique_wildtypes_df['wildtype'].map(aa_property_mapping)
tmp_df = df.groupby('site')['effect'].sum().reset_index()
tmp_df = pd.merge(tmp_df, unique_wildtypes_df[['site', 'residue_type']], on='site', how='left')
barrel_ranges = {
#"Stalk": list(range(70, 147)),
#"Neck": list(range(147, 166)),
#"Linker": list(range(166, 178)),
'Receptor Contact': config['contact_sites'],
}
agg_means = []
# For each barrel, filter the tmp_df dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"], 'residue_type': row['residue_type']}
)
agg_means_df = pd.DataFrame(agg_means)
# Update chart to color by 'residue_type'
chart = (
alt.Chart(agg_means_df)
.mark_bar(opacity=1)
.encode(
x=alt.X("site:N", axis=alt.Axis(labelAngle=-90)),
y=alt.Y("effect", title="Summed Cell Entry for EFNB3"),
tooltip=["site", "effect", "residue_type"],
color=alt.Color('residue_type:N', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(title="Wildtype Residue Type"))
)
).properties(width=800, height=200)
return chart
# Example usage, assuming `func_scores_E3` is your input DataFrame
entry_by_site_receptor_plot = entry_by_site_receptor(func_scores_E3)
entry_by_site_receptor_plot.display()
/loc/scratch/38622884/ipykernel_5163/2551888575.py:17: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy unique_wildtypes_df['residue_type'] = unique_wildtypes_df['wildtype'].map(aa_property_mapping)
In [22]:
def entry_by_site_rolling(df):
tmp_df = df.groupby('site')['effect'].median().reset_index()
tmp_df['rolling_sum'] = tmp_df['effect'].rolling(window=10, min_periods=1).median()
barrel_ranges = {
"Stalk": list(range(96, 147)),
"Neck": list(range(148, 165)),
"Linker": list(range(166, 177)),
"Head": list(range(178, 602)),
#"Cysteine": list([158,162]),
#'Receptor Contact': config['contact_sites'],
# "Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["rolling_sum"], "site": row["site"]} # Use rolling_sum for the effect
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df)
.mark_rect(opacity=1, size=25)
.encode(
x=alt.X("site:N", axis=alt.Axis(labelAngle=-90, values=[100, 200, 300, 400, 500, 600],tickCount=6,grid=True)),
y=alt.Y("effect", title="Cell Entry for EFNB3"),
tooltip=["site", "effect"],
color=alt.Color('region', sort=custom_order),
)
).properties(width=800, height=200)
return chart
test_plot = entry_by_site_rolling(func_scores_E3)
test_plot.display()
In [23]:
def make_boxplot_entry_region(df):
barrel_ranges = {
"Beta1-S1": list(range(215, 225)),
"Beta1-S2": list(range(228, 237)),
"Beta1-S3": list(range(244, 257)),
"Beta1-S4": list(range(263, 271)),
"Beta2-S1": list(range(279, 287)),
"Beta2-S2": list(range(290, 297)),
"Beta2-S3": list(range(314, 320)),
"Beta2-S4": list(range(332, 336)),
#"Head": list(range(178, 602)),
#'Receptor Contact': config['contact_sites'],
#"Total": list(range(71, 602)),
}
custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact", "Total"]
empty_charts = []
for selection in ["CHO-EFNB2", "CHO-EFNB3"]:
if selection == "CHO-EFNB2":
effect_name = "EFNB2"
else:
effect_name = "EFNB3"
tmp_df = df[df["cell_type"] == selection]
agg_means = []
# For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
for barrel, sites in barrel_ranges.items():
subset = tmp_df[tmp_df["site"].isin(sites)]
for _, row in subset.iterrows():
agg_means.append(
{"region": barrel, "effect": row["effect"], "site": row["site"]}
)
agg_means_df = pd.DataFrame(agg_means)
chart = (
alt.Chart(agg_means_df, title=f"{selection}")
.mark_boxplot(color='darkgray')
.encode(
x=alt.X(
"region:O",
sort=custom_order,
title="RBP Region",
axis=alt.Axis(labelAngle=-90),
),
y=alt.Y(
"effect",
title=f"Cell Entry",
axis=alt.Axis(grid=True, tickCount=4),
),
tooltip=["region", "effect", "site"],
).properties(width=config['width'],height=config['height'])
)
empty_charts.append(chart)
combined_effect_chart = alt.hconcat(*empty_charts).resolve_scale(
y="shared", x="shared", color="independent"
)
return combined_effect_chart
entry_region_boxplot = make_boxplot_entry_region(concat_df)
entry_region_boxplot.display()
In [ ]: